

*Clean downloaded data to create datasets

*Information on student characteristics 
use "I:\MAA2015-20 Credit and Education\data\course07_17.dta", clear
drop if moe_crs_year_nbr<2009
preserve
egen nzsced = mode(moe_crs_qual_nzsced_code),by(snz_uid moe_crs_year_nbr)
egen provider=mode(moe_crs_provider_code),by(snz_uid moe_crs_year_nbr)
egen firstyr=mode(moe_crs_first_tertiary_year_nbr),by(snz_uid moe_crs_year_nbr)

collapse provider  nzsced firstyr (max) moe_crs_birth_year_nbr moe_crs_last_school_decile_code moe_crs_disability_code moe_crs_sex_snz_code moe_crs_ethnic1_snz_code,by( snz_uid moe_crs_year_nbr)
gen year= moe_crs_year_nbr
compress*
save "demo.dta",replace
restore

*Infotmation on NCEA level 3 
use "I:\MAA2015-20 Credit and Education\data\percent.dta", clear
collapse (max) moe_sps_expected_percentil3_nbr,by(snz_uid snz_moe_uid)
gen percentile= moe_sps_expected_percentil3_nbr if moe_sps_expected_percentil3_nbr>-1
drop moe_sps_expected_percentil3_nbr
compress*
save "percentscore.dta",replace

*Bachelor's degree completion; we take the highest degree (Bachelor's and above) in the earliest year
use  "I:\MAA2015-20 Credit and Education\data\complete.dta",clear
gen year=moe_com_year_nbr
drop if moe_com_year_nbr<2009
keep snz_uid snz_moe_uid year moe_com_qual_code moe_com_provider_code moe_com_qual_nzsced_code moe_com_qacc_code  
compress*
gen qualba_complete=(moe_com_qacc_code<=20)
keep if qualba_complete==1
sort snz_uid year moe_com_qacc_code
egen highest=min( moe_com_qacc_code),by(snz_uid year)
drop if highest!= moe_com_qacc_code
compress*
drop highest qualba_complete
duplicates drop snz_uid year,force
egen min=min(year),by(snz_uid)
drop if min!=year
gen ba_year=year
rename moe_com_provider_code ba_provider
rename moe_com_qual_code ba_qual
rename moe_com_qual_nzsced_code ba_nzsced
rename moe_com_qacc_code ba_qacc
drop min
compress*
save "ba_year.dta",replace

* Information on EFTS taken in 2008
use "I:\MAA2015-20 Credit and Education\data\course07_17.dta", clear
drop if moe_crs_year_nbr>=2010
gen end_year=year(moe_crs_end_date)
gen start_year=year(moe_crs_start_date)
collapse (max) moe_crs_complete_code  moe_crs_qual_type_code  moe_crs_efts_course_factor_nbr end_year    moe_crs_start_date moe_crs_withdrawal_date,by(snz_uid snz_moe_uid moe_crs_end_date moe_crs_course_code moe_crs_provider_code  moe_crs_qual_code)
drop if end_year>=2009
gen     completion=(moe_crs_complete_code==2| moe_crs_complete_code==3| moe_crs_complete_code==4)
replace completion= 0 if (moe_crs_withdrawal_date-moe_crs_start_date )<=30
compress*
keep if moe_crs_qual_type_code ==3 
drop if end_year==2007 
gen pass=(moe_crs_complete_code==2)      if completion==1            
gen efts08=  completion*moe_crs_efts_course_factor_nbr
gen efts_pass08 = pass*efts08    if completion==1 
collapse  (sum) efts08 efts_pass08  ,by(snz_uid snz_moe_uid)
compress*
save "efts_08.dta",replace

*Inforamtion on student loan borrowed and allowance received each year 
use "I:\MAA2015-20 Credit and Education\data\borrow00_17.dta", clear
gen loan_amt=msd_sla_ann_drawn_fee_amt+ msd_sla_ann_drawn_living_cst_mt+ msd_sla_ann_admin_fee_amt+ msd_sla_ann_drawn_course_rel_mt
drop if msd_sla_year_nbr<2009
gen year = msd_sla_year_nbr
keep snz_uid snz_ird_uid year loan_amt msd_sla_balance_amt msd_sla_sa_study_start_date msd_sla_sa_study_end_date msd_sla_sl_study_status_code msd_sla_ann_allowance_paid_amt  msd_sla_ann_accommodation_amt 
sort snz_uid year loan_amt
duplicates drop snz_uid year,force
compress*
save "loan.dta", replace

*Get course information and then caculate accumualated pass rate
*As a small proportion of courses can be across calander year, we collapse the data using course end date so the data is uniquely identified by individual, course, and year.
use "I:\MAA2015-20 Credit and Education\data\course07_17.dta", clear
drop if moe_crs_year_nbr<2009
gen ba_course= (moe_crs_qacc_code==20|moe_crs_qacc_code==12)
egen ba_enrol=max(ba_course),by( snz_uid)
keep if ba_enrol==1
gen end_year=year(moe_crs_end_date)
gen start_year=year(moe_crs_start_date)
gen cross= (end_year!= start_year)
collapse (max) moe_crs_complete_code  moe_crs_qual_type_code ba_course moe_crs_efts_course_factor_nbr  moe_crs_first_tertiary_year_nbr  end_year start_year moe_crs_qacc_code moe_crs_start_date moe_crs_withdrawal_date cross  moe_crs_study_type_code,by(snz_uid snz_moe_uid moe_crs_end_date moe_crs_course_code moe_crs_provider_code  moe_crs_qual_code)
drop if end_year>2017
gen     completion=(moe_crs_complete_code==2| moe_crs_complete_code==3| moe_crs_complete_code==4)
replace completion= 0 if (moe_crs_withdrawal_date-moe_crs_start_date )<=30
*As suggested in the Single Data Return manual provided by the Ministry of Education, we exclude any courses indicating a withdrawal date within 30 days of the course start date from the EFTS calculations.
compress*
save "course_endyear.dta", replace

*Then, we caculate EFTS taken and passed per year
use "course_endyear.dta", clear
keep if moe_crs_qual_type_code ==3
*keep only formal courses (more than one week long and greater than 0.03efts)
drop if end_year<2009 
egen firstyr=mode(moe_crs_first_tertiary_year_nbr),by(snz_uid)
egen provider=mode(moe_crs_provider_code),by(start_year snz_uid)
replace start_year=2009 if start_year<2009&end_year>=2009
gen pass=(moe_crs_complete_code==2)      if completion==1            
gen efts=  completion*moe_crs_efts_course_factor_nbr
gen efts_pass_startyear = pass*efts    if completion==1 
collapse (max) provider ba_course firstyr (sum)efts efts_pass_startyear moe_crs_efts_course_factor_nbr (min)moe_crs_study_type_code ,by(snz_uid snz_moe_uid start_year)
gen year= start_year
sort snz_uid year
compress*
save "efts_startyear.dta",replace

* create a individual-year panel for students 
use "efts_startyear.dta",clear
collapse  snz_moe_uid, by(snz_uid )
compress*
gen year=2009
forv i=1(1)8 {
gen year`i'=year+`i'
}
rename year year0
reshape long year, i(snz_uid snz_moe_uid) j(tertiary_year)
drop if year>2017
compress*
save "bal.dta",replace
* merge information on student loan and efts 
merge m:1 snz_uid year using "loan.dta"
drop if _merge==2
*egen maxloan=max(loan_amt),by(snz_uid)
*drop if maxloan<100|maxloan==.
drop _merge
merge m:1 snz_uid year using "efts_startyear.dta"
drop if _merge==2
drop  msd_sla_balance_amt start_year _merge 
sort snz_uid year
by snz_uid: gen efts_acc=sum(efts[_n])
by snz_uid: egen ind=seq() if efts_acc>=1.6
* indicator for 1.6 efts accumulated
by snz_uid: gen efts_pass_acc=sum(efts_pass_startyear[_n])
gen ratio_pass= efts_pass_acc/ efts_acc
* create running variable pass rate
keep if ind >=1&ind<.
drop if year==2017 & ind==1
*our last year of data is 2017
drop if year==2009&ind==1
*drop those who complete 1.6efts in 2009 as the rule is effective in 2011 not 2010

sort snz_uid ind
by snz_uid: gen      enrolba=(efts[1]<.&efts[1]>=0.1&ba_course==1)              if year>=2010&ind==1
*enroll in any bachelor's course in the year before performance assessment (year t)

by snz_uid: gen      loaner=(year[1]!=. & loan_amt [1]>=100& loan_amt [1]<.)    if year>=2010&ind==1
by snz_uid: replace  loaner= 0 if (efts[1]==.|efts[1]<0.1)& year>=2010&ind==1
*To be classified as having a student loan in yaer t, a student needs to have a loan amount of at leat $100 and take no less than 0.1 EFTS

by snz_uid: gen      loan=  (year[2]!=. & loan_amt [2]>=100& loan_amt [2]<.)    if year>=2010&ind==1
by snz_uid: replace  loan= 0 if (efts[2]==.|efts[2]<0.1)& year>=2010&ind==1
*have a student loan in year after performance assessment (year t + 1)
by snz_uid: gen      allower=(year[1]!=. & msd_sla_ann_allowance_paid_amt [1]>=100& msd_sla_ann_allowance_paid_amt [1]<.)    if year>=2010&ind==1
by snz_uid: replace  allower= 0 if (efts[1]==.|efts[1]<0.1)& year>=2010&ind==1
*students with student allowance in year t
by snz_uid: gen      allow=  (year[2]!=. & msd_sla_ann_allowance_paid_amt [2]>=100& msd_sla_ann_allowance_paid_amt [2]<.)    if year>=2010&ind==1
by snz_uid: replace  allow= 0 if (efts[2]==.|efts[2]<0.1)& year>=2010&ind==1
*have a student allowance in year t +1
by snz_uid: gen      enrol  =(efts[2]<.&efts[2]>=0.1)                           if year>=2010&ind==1
*re-enrol in tertiary study in year t + 1
  
egen indyear=min(year),by(snz_uid) 
gen  indyr  =2017-indyear


*Below are additional codes for generating inforamtion on enrollment, loan status, efts taken and passed, full-time enrollment in each year before and after perforamnce assessment (year t, t+1, t+2,... t+7)
/*
forv i==0(1)7 {
local k =`i' +1
by snz_uid: gen loan_post`i'yr=(year[`k']!=.&loan_amt[`k']>=100&loan_amt[`k']<.) if year>=2010&ind==1&`i'<=indyr
by snz_uid: replace loan_post`i'yr= 0 if (efts[`k']==.|efts[`k']<0.1)&year>=2010&ind==1&`i'<=indyr
by snz_uid: gen enrol_post`i'yr=(efts[`k']<.&efts[`k']>=0.1) if year>=2010&ind==1&`i'<=indyr
by snz_uid: gen enrolba_post`i'yr=(enrol_post`i'yr==1&ba_course[`k']==1) if year>=2010&ind==1&`i'<=indyr
by snz_uid: gen enrolefts_post`i'yr=efts[`k'] if year>=2010&ind==1&`i'<=indyr 
by snz_uid: gen enrolefts_pass_post`i'yr=efts_pass_startyear[`k'] if year>=2010&ind==1&`i'<=indyr 
by snz_uid: gen enrolefts_acc_post`i'yr =efts_acc_post[`k'] if year>=2010&ind==1&`i'<=indyr
by snz_uid: gen      enrolefts_pass_acc_post`i'yr=efts_pass_acc_post[`k'] if year>=2010&ind==1&`i'<=indyr

}

forv i==0(1)7 {     
by snz_uid: replace enrol_post`i'yr               =0 if enrol_post`i'yr          ==.&`i'<=indyr
by snz_uid: replace enrolba_post`i'yr             =0 if enrolba_post`i'yr        ==.&`i'<=indyr
by snz_uid: replace enrolefts_post`i'yr           =0 if enrolefts_post`i'yr      ==.&`i'<=indyr
by snz_uid: replace enrolefts_pass_post`i'yr      =0 if enrolefts_pass_post`i'yr ==.&`i'<=indyr
by snz_uid: replace enrolefts_acc_post`i'yr       =0 if enrolefts_acc_post`i'yr      ==.&`i'<=indyr
by snz_uid: replace enrolefts_pass_acc_post`i'yr  =0 if enrolefts_pass_acc_post`i'yr ==.&`i'<=indyr

}
*/
compress*
save  "sample.dta",replace

*merge the sample with student characterisitics 
use "sample.dta",clear
merge m:1 snz_uid year using "demo.dta"
drop if _merge==2
drop _merge
gen age = year- moe_crs_birth_year_nbr if moe_crs_birth_year_nbr!=.
gen white =(moe_crs_ethnic1_snz_code <=129) if moe_crs_ethnic1_snz_code!=.
gen maori =(moe_crs_ethnic1_snz_code>=211& moe_crs_ethnic1_snz_code <=371)  if moe_crs_ethnic1_snz_code!=.
gen disable =(moe_crs_disability_code==2) if moe_crs_disability_code!=.
gen female =(moe_crs_sex_snz_code==2) if moe_crs_sex_snz_code!=.
gen fullyr=( moe_crs_study_type_code==1) if moe_crs_study_type_code!=.
gen uni=(provider>=7001&provider<=7008) if provider!=.
gen sced=int(nzsced/10000)
gen science=(sced>=1&sced<=3) if sced!=.
gen business=(sced==8)        if sced!=.
rename moe_crs_last_school_decile_code  decile
gen treat=(ratio_pass>=0.5)
gen ratio= ratio_pass-0.5
gen treat_ratio=treat*ratio
set matsize 10000

*generate a sample without missing data in student characteristics
reg enrol treat ratio_pass female age maori disable decile i.moe_crs_study_type_code business science i.provider   i.firstyr  i.year  if ind==1 
gen basesample=e(sample)
keep if basesample==1
drop basesample
keep if firstyr>=2005
rename firstyr cohort
xi i.cohort
rename _Icohort_# cohort#
gen provider1=provider
replace provider1=7000 if provider1<7001|provider1>7008
xi i.provider1
rename _Iprovider1_# provider#
drop provider1 provider7006
* drop dummy for the samllest university as it causes perfect collinearity due to small number of obs
xi i.year
rename _Iyear_# year#

merge m:1 snz_uid  using "percentscore.dta"
drop if _merge==2
drop _merge
merge m:m snz_uid using "efts_08.dta"
drop if _merge==2
replace efts_pass08=0 if efts_pass08==.
replace efts08=0 if efts08==.
drop _merge
*merge with bachelor's degree completion
merge m:m snz_uid using "ba_year.dta"
drop if _merge==2
drop _merge
gen indba=ba_year-year  
gen ba_post1yr=(indba<=1)
gen ba_post2yr=(indba<=2)
gen ba_post3yr=(indba<=3)
gen ba_post4yr=(indba<=4)
gen ba_post5yr=(indba<=5)
gen ba_post6yr=(indba<=6)
gen ba_post7yr=(indba<=7)
gen  yr= ba_year - cohort+1
replace yr=. if ba_year<year
replace yr=3 if yr<=3
gen ba_4yr=(yr<=4)
gen ba_5yr=(yr<=5)
gen ba_6yr=(yr<=6)
gen ba_7yr=(yr<=7)
replace tertiary_year =year-cohort +1

drop msd_sla_ann_allowance_paid_amt- msd_sla_sl_study_status_code moe_crs_disability_code moe_crs_birth_year_nbr moe_crs_sex_snz_code moe_crs_ethnic1_snz_code ind yr efts efts_pass_startyear moe_crs_efts_course_factor_nbr moe_crs_study_type_code moe_crs_year_nbr nzsced indyear ba_provider- ba_qacc ba_course cohort2006 cohort2007 cohort2012-cohort2016 year2013-year2016 indba

*create analysis sample
keep if cohort>=2008&cohort<=2011&age<24
keep if loaner==1
keep if enrolba==1
keep if ba_year>year
keep if indyr>=5
keep if efts_pass08<1&efts_pass_acc<3
compress*
save "basesample.dta",replace

use"basesample.dta",clear
duplicates drop snz_uid,force
keep snz_uid 
compress*
save "uid_final.dta",replace


* data for studnet loan balance
use "I:\MAA2015-20 Credit and Education\data\IRD_loan.dta" ,clear
merge m:m snz_uid using "uid_final.dta"
drop if _merge ==1
save "loan_fin.dta",replace
*use "loan_fin.dta" ,clear
set more off
replace ir_fin_return_year_nbr=ir_fin_return_year_nbr-1
*For example, 2012 balance is the balance on March 31 2012, the amount borrow in 2011 will be in 2012. So I subtract one year to make the loan balance reflect borrowing in the previous year
drop if ir_fin_return_year_nbr<2010
keep snz_uid ir_fin_return_year_nbr ir_fin_loan_bal_process_date_mt ir_fin_period_payment_amt ir_fin_tot_payment_amt
*use ir_fin_loan_bal_process_date_mt instead of the one based on effecttive date because it is what the IRD reported and borrowers see from their account 
duplicates tag snz_uid ir_fin_return_year_nbr,gen(tag)
tab ir_fin_loan_bal_process_date_mt if tag==1
collapse(max) ir_fin_loan_bal_process_date_mt ir_fin_period_payment_amt ir_fin_tot_payment_amt,by(snz_uid ir_fin_return_year_nbr)
drop if ir_fin_return_year_nbr==.
rename (ir_fin_loan_bal_process_date_mt ir_fin_return_year_nbr ir_fin_period_payment_amt ir_fin_tot_payment_amt) (loan_bal year payment tot_payment)
reshape wide loan_bal payment tot_payment, i(snz_uid) j( year)
drop payment* tot_payment*
save "loan_fin_w.dta",replace

* data for earnings, month with positive earnings, earnings percentile, month with earnings above full-time minimum wage
forv i==10(1)17 {
use "I:\MAA2015-20 Credit and Education\data\IRD_inc_`i'.dta" ,clear
foreach j in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"{
gen du_was_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>0&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
if `i'==10 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2040&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==11 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2080&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==12 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2160&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==13 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2200&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==14 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2280&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==15 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2360&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==16 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2440&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 else if `i'==17 {
gen du_minwage_`j'=(inc_cal_yr_sum_WAS_mth_`j'_amt>2520&inc_cal_yr_sum_WAS_mth_`j'_amt<.)
 }
 }
gen sum_du_was=du_was_01+du_was_02+du_was_03+du_was_04+du_was_05+du_was_06+du_was_07+du_was_08+du_was_09+du_was_10+du_was_11+du_was_12
gen sum_du_minwage=du_minwage_01+du_minwage_02+du_minwage_03+du_minwage_04+du_minwage_05+du_minwage_06+du_minwage_07+du_minwage_08+du_minwage_09+du_minwage_10+du_minwage_11+du_minwage_12
gen avg_was=inc_cal_yr_sum_WAS_tot_amt/ sum_du_was
keep inc_cal_yr_sum_year_nbr snz_uid sum_* avg_*
compress*
merge m:m snz_uid using "I:\MAA2015-20 Credit and Education\data\demo_all.dta"
keep if _merge==3
drop _merge
bysort snz_sex_code snz_birth_year_nbr:egen rank= rank(avg_was) if avg_was>0&avg_was<.,unique
*use "unique" option to ensure we have exact 100 percentiles
bysort snz_sex_code snz_birth_year_nbr:egen maxrank= max(rank) if avg_was>0&avg_was<.
gen percentile= rank/ maxrank if avg_was>0&avg_was<.
replace percentile=percentile*100 if avg_was>0&avg_was<.
gen was_percent=ceil(percentile) if avg_was>0&avg_was<.
replace was_percent=0 if avg_was==0
replace was_percent=0 if avg_was==.
merge m:m snz_uid using "uid_final.dta"
keep if _merge==3
drop _merge
drop link_set_key- snz_ethnicity_source_code rank maxrank percentile
compress*
save "avg_inc20`i'.dta",replace
}
use "avg_inc2010.dta",clear
forv i==11(1)17 {
append using "avg_inc20`i'.dta"
}
save "avg_inc.dta",replace
reshape wide sum_du_was sum_du_minwage avg_was was_percent,i( snz_uid) j( inc_cal_yr_sum_year_nbr )
compress*
save "avg_inc_w.dta",replace

use "basesample",clear
merge m:m snz_uid using "avg_inc_w.dta"
drop if _merge==2
drop _merge
merge m:m snz_uid using "loan_fin_w.dta"
drop if _merge==2
drop _merge



forv i==2010(1)2017 {

replace loan_bal`i'     =0 if loan_bal`i'==.
replace avg_was`i'      =0 if avg_was`i'==.
replace was_percent`i'  =0 if was_percent`i'==.
replace sum_du_was`i'   = 0 if sum_du_was`i' ==.
replace sum_du_minwage`i' = 0 if sum_du_minwage`i' ==.


}

forv i==0(1)7 {
gen     loan_bal_post`i'yr=.
gen     avg_was_post`i'yr=.
gen     was_percent_post`i'yr=.
gen     sum_du_was_post`i'yr= .
gen     sum_du_minwage_post`i'yr= .

}

forv i==0(1)7 {
forv j==2010(1)2016{
local k = `j' + `i' 
if `k' <=2017 {
replace     loan_bal_post`i'yr=loan_bal`k'                            if year==`j' 
replace     avg_was_post`i'yr= avg_was`k'                             if year==`j'  
replace     was_percent_post`i'yr= was_percent`k'                     if year==`j' 
replace     sum_du_was_post`i'yr= sum_du_was`k'                       if year==`j'
replace     sum_du_minwage_post`i'yr= sum_du_minwage`k'               if year==`j' 

}
}
}
drop sum_du_was2010-loan_bal2017
compress*
save "basesample_earn.dta",replace

